# spamPath = system.file(package = 'RSpamData') spamPath =
# '/Users/nolan/RPackages/RSpamData'spamPath = '.'
spamPath = "../Data/spam/"
# list.dirs(spamPath, full.names = FALSE)
list.files(path = paste(spamPath, "messages", sep = .Platform$file.sep))
#> [1] "easy_ham" "easy_ham_2" "hard_ham" "spam" "spam_2"
f = list.files(path = paste(spamPath, "messages/spam", sep = .Platform$file.sep))
length(f)
#> [1] 1001
head(list.files(path = paste(spamPath, "messages", "spam_2", sep = .Platform$file.sep)))
#> [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
#> [2] "00002.9438920e9a55591b18e60d1ed37d992b"
#> [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
#> [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
#> [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
#> [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
dirNames = list.files(path = paste(spamPath, "messages", sep = .Platform$file.sep))
length(list.files(paste(spamPath, "messages", dirNames, sep = .Platform$file.sep)))
#> [1] 9353
sapply(paste(spamPath, "messages", dirNames, sep = .Platform$file.sep), function(dir) length(list.files(dir)))
#> ../Data/spam//messages/easy_ham ../Data/spam//messages/easy_ham_2
#> 5052 1401
#> ../Data/spam//messages/hard_ham ../Data/spam//messages/spam
#> 501 1001
#> ../Data/spam//messages/spam_2
#> 1398
fullDirNames = paste(spamPath, "messages", dirNames, sep = .Platform$file.sep)
fileNames = list.files(fullDirNames[1], full.names = TRUE)
fileNames[1]
#> [1] "../Data/spam//messages/easy_ham/00001.7c53336b37003a9286aba55d2945844c"
msg = readLines(fileNames[1])
head(msg)
#> [1] "From exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002"
#> [2] "Return-Path: <exmh-workers-admin@spamassassin.taint.org>"
#> [3] "Delivered-To: zzzz@localhost.netnoteinc.com"
#> [4] "Received: from localhost (localhost [127.0.0.1])"
#> [5] "\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36"
#> [6] "\tfor <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)"
indx = c(1:5, 15, 27, 68, 69, 329, 404, 427, 516, 852, 971)
fn = list.files(fullDirNames[1], full.names = TRUE)[indx]
sampleEmail = sapply(fn, readLines)
msg = sampleEmail[[1]]
which(msg == "")[1]
#> [1] 63
#> [1] 63
splitPoint = match("", msg)
msg[(splitPoint - 2):(splitPoint + 6)]
#> [1] "List-Archive: <https://listman.spamassassin.taint.org/mailman/private/exmh-workers/>"
#> [2] "Date: Thu, 22 Aug 2002 18:26:25 +0700"
#> [3] ""
#> [4] " Date: Wed, 21 Aug 2002 10:54:46 -0500"
#> [5] " From: Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>"
#> [6] " Message-ID: <1029945287.4797.TMDA@deepeddy.vircio.com>"
#> [7] ""
#> [8] ""
#> [9] " | I can't reproduce this error."
header = msg[1:(splitPoint - 1)]
body = msg[-(1:splitPoint)]
splitMessage = function(msg) {
splitPoint = match("", msg)
header = msg[1:(splitPoint - 1)]
body = msg[-(1:splitPoint)]
return(list(header = header, body = body))
}
sampleSplit = lapply(sampleEmail, splitMessage)
header = sampleSplit[[1]]$header
grep("Content-Type", header)
#> [1] 46
grep("multi", tolower(header[46]))
#> integer(0)
#> [1] "Content-Type: text/plain; charset=us-ascii"
headerList = lapply(sampleSplit, function(msg) msg$header)
CTloc = sapply(headerList, grep, pattern = "Content-Type")
CTloc
#> $`../Data/spam//messages/easy_ham/00001.7c53336b37003a9286aba55d2945844c`
#> [1] 46
#>
#> $`../Data/spam//messages/easy_ham/00002.9c4069e25e1ef370c078db7ee85ff9ac`
#> [1] 45
#>
#> $`../Data/spam//messages/easy_ham/00003.860e3c3cee1b42ead714c5c874fe25f7`
#> [1] 42
#>
#> $`../Data/spam//messages/easy_ham/00004.864220c5b6930b209cc287c361c99af1`
#> [1] 30
#>
#> $`../Data/spam//messages/easy_ham/00005.bf27cdeaf0b8c4647ecd61b1d09da613`
#> [1] 44
#>
#> $`../Data/spam//messages/easy_ham/00014.cb20e10b2bfcb8210a1c310798532a57`
#> [1] 54
#>
#> $`../Data/spam//messages/easy_ham/00025.d685245bdc4444f44fa091e6620b20b3`
#> integer(0)
#>
#> $`../Data/spam//messages/easy_ham/00062.009f5a1a8fa88f0b38299ad01562bb37`
#> [1] 21
#>
#> $`../Data/spam//messages/easy_ham/00063.0acbc484a73f0e0b727e06c100d8df7b`
#> [1] 17
#>
#> $`../Data/spam//messages/easy_ham/0030.77828e31de08ebb58b583688b87524cc`
#> [1] 52
#>
#> $`../Data/spam//messages/easy_ham/00368.f86324a03e7ae7070cc40f302385f5d3`
#> [1] 31
#>
#> $`../Data/spam//messages/easy_ham/00389.8606961eaeef7b921ce1c53773248d69`
#> [1] 52
#>
#> $`../Data/spam//messages/easy_ham/0047.5c3e049737a2813d4ac6f13f02362fb1`
#> [1] 52
#>
#> $`../Data/spam//messages/easy_ham/00775.0e012f373467846510d9db297e99a008`
#> [1] 27
#>
#> $`../Data/spam//messages/easy_ham/00883.c44a035e7589e83076b7f1fed8fa97d5`
#> [1] 31
sapply(headerList, function(header) {
CTloc = grep("Content-Type", header)
if (length(CTloc) == 0)
return(NA)
CTloc
})
#> ../Data/spam//messages/easy_ham/00001.7c53336b37003a9286aba55d2945844c
#> 46
#> ../Data/spam//messages/easy_ham/00002.9c4069e25e1ef370c078db7ee85ff9ac
#> 45
#> ../Data/spam//messages/easy_ham/00003.860e3c3cee1b42ead714c5c874fe25f7
#> 42
#> ../Data/spam//messages/easy_ham/00004.864220c5b6930b209cc287c361c99af1
#> 30
#> ../Data/spam//messages/easy_ham/00005.bf27cdeaf0b8c4647ecd61b1d09da613
#> 44
#> ../Data/spam//messages/easy_ham/00014.cb20e10b2bfcb8210a1c310798532a57
#> 54
#> ../Data/spam//messages/easy_ham/00025.d685245bdc4444f44fa091e6620b20b3
#> NA
#> ../Data/spam//messages/easy_ham/00062.009f5a1a8fa88f0b38299ad01562bb37
#> 21
#> ../Data/spam//messages/easy_ham/00063.0acbc484a73f0e0b727e06c100d8df7b
#> 17
#> ../Data/spam//messages/easy_ham/0030.77828e31de08ebb58b583688b87524cc
#> 52
#> ../Data/spam//messages/easy_ham/00368.f86324a03e7ae7070cc40f302385f5d3
#> 31
#> ../Data/spam//messages/easy_ham/00389.8606961eaeef7b921ce1c53773248d69
#> 52
#> ../Data/spam//messages/easy_ham/0047.5c3e049737a2813d4ac6f13f02362fb1
#> 52
#> ../Data/spam//messages/easy_ham/00775.0e012f373467846510d9db297e99a008
#> 27
#> ../Data/spam//messages/easy_ham/00883.c44a035e7589e83076b7f1fed8fa97d5
#> 31
hasAttach = sapply(headerList, function(header) {
CTloc = grep("Content-Type", header)
if (length(CTloc) == 0)
return(FALSE)
grepl("multi", tolower(header[CTloc]))
})
hasAttach
#> ../Data/spam//messages/easy_ham/00001.7c53336b37003a9286aba55d2945844c
#> FALSE
#> ../Data/spam//messages/easy_ham/00002.9c4069e25e1ef370c078db7ee85ff9ac
#> FALSE
#> ../Data/spam//messages/easy_ham/00003.860e3c3cee1b42ead714c5c874fe25f7
#> FALSE
#> ../Data/spam//messages/easy_ham/00004.864220c5b6930b209cc287c361c99af1
#> FALSE
#> ../Data/spam//messages/easy_ham/00005.bf27cdeaf0b8c4647ecd61b1d09da613
#> FALSE
#> ../Data/spam//messages/easy_ham/00014.cb20e10b2bfcb8210a1c310798532a57
#> TRUE
#> ../Data/spam//messages/easy_ham/00025.d685245bdc4444f44fa091e6620b20b3
#> FALSE
#> ../Data/spam//messages/easy_ham/00062.009f5a1a8fa88f0b38299ad01562bb37
#> TRUE
#> ../Data/spam//messages/easy_ham/00063.0acbc484a73f0e0b727e06c100d8df7b
#> TRUE
#> ../Data/spam//messages/easy_ham/0030.77828e31de08ebb58b583688b87524cc
#> TRUE
#> ../Data/spam//messages/easy_ham/00368.f86324a03e7ae7070cc40f302385f5d3
#> TRUE
#> ../Data/spam//messages/easy_ham/00389.8606961eaeef7b921ce1c53773248d69
#> TRUE
#> ../Data/spam//messages/easy_ham/0047.5c3e049737a2813d4ac6f13f02362fb1
#> TRUE
#> ../Data/spam//messages/easy_ham/00775.0e012f373467846510d9db297e99a008
#> TRUE
#> ../Data/spam//messages/easy_ham/00883.c44a035e7589e83076b7f1fed8fa97d5
#> TRUE
header = sampleSplit[[6]]$header
boundaryIdx = grep("boundary=", header)
header[boundaryIdx]
#> [1] " boundary=\"==_Exmh_-1317289252P\";"
sub(".*boundary=\"(.*)\";.*", "\\1", header[boundaryIdx])
#> [1] "==_Exmh_-1317289252P"
header2 = headerList[[9]]
boundaryIdx2 = grep("boundary=", header2)
header2[boundaryIdx2]
#> [1] "Content-Type: multipart/alternative; boundary=Apple-Mail-2-874629474"
sub(".*boundary=\"(.*)\";.*", "\\1", header2[boundaryIdx2])
#> [1] "Content-Type: multipart/alternative; boundary=Apple-Mail-2-874629474"
boundary2 = gsub("\"", "", header2[boundaryIdx2])
sub(".*boundary= *(.*);?.*", "\\1", boundary2)
#> [1] "Apple-Mail-2-874629474"
boundary = gsub("\"", "", header[boundaryIdx])
sub(".*boundary= *(.*);?.*", "\\1", boundary)
#> [1] "==_Exmh_-1317289252P;"
sub(".*boundary= *([^;]*);?.*", "\\1", boundary)
#> [1] "==_Exmh_-1317289252P"
getBoundary = function(header) {
boundaryIdx = grep("boundary=", header)
boundary = gsub("\"", "", header[boundaryIdx])
gsub(".*boundary= *([^;]*);?.*", "\\1", boundary)
}
sampleSplit[[6]]$body
#> [1] "--==_Exmh_-1317289252P"
#> [2] "Content-Type: text/plain; charset=us-ascii"
#> [3] ""
#> [4] "> From: Chris Garrigues <cwg-exmh@DeepEddy.Com>"
#> [5] "> Date: Wed, 21 Aug 2002 10:40:39 -0500"
#> [6] ">"
#> [7] "> > From: Chris Garrigues <cwg-exmh@DeepEddy.Com>"
#> [8] "> > Date: Wed, 21 Aug 2002 10:17:45 -0500"
#> [9] "> >"
#> [10] "> > Ouch...I'll get right on it."
#> [11] "> > "
#> [12] "> > > From: Robert Elz <kre@munnari.OZ.AU>"
#> [13] "> > > Date: Wed, 21 Aug 2002 19:30:01 +0700"
#> [14] "> > >"
#> [15] "> > > Any chance of having that lengthen instead? I like all my exmh stuff"
#> [16] "> > > in nice columns (fits the display better). That is, I use the detache"
#> [17] "> d"
#> [18] "> > > folder list, one column. The main exmh window takes up full screen,"
#> [19] "> > > top to bottom, but less than half the width, etc..."
#> [20] "> "
#> [21] "> I thought about that. The first order approximation would be to just add "
#> [22] "> using pack .... -side top instead of pack ... -side left, however, since their "
#> [23] "> each a different width, it would look funny."
#> [24] ""
#> [25] "I've done this. It's not as pretty as I think it should be, but it works. "
#> [26] "I'm going to leave the cosmetic issues to others. When I update the "
#> [27] "documentation, I'll add this to the exmh.TODO file."
#> [28] ""
#> [29] "I'm leaving for a 2 1/2 week vacation in a week, so this is the last new "
#> [30] "functionality I'm going to add for a while. Also, I now have pretty much "
#> [31] "everything in there that I want for my own use, so I'm probably pretty much "
#> [32] "done. I'll work on bug fixes and documentation before my vacation, and "
#> [33] "hopefully do nothing more afterwards."
#> [34] ""
#> [35] "Chris"
#> [36] ""
#> [37] "-- "
#> [38] "Chris Garrigues http://www.DeepEddy.Com/~cwg/"
#> [39] "virCIO http://www.virCIO.Com"
#> [40] "716 Congress, Suite 200"
#> [41] "Austin, TX 78701\t\t+1 512 374 0500"
#> [42] ""
#> [43] " World War III: The Wrong-Doers Vs. the Evil-Doers."
#> [44] ""
#> [45] ""
#> [46] ""
#> [47] ""
#> [48] "--==_Exmh_-1317289252P"
#> [49] "Content-Type: application/pgp-signature"
#> [50] ""
#> [51] "-----BEGIN PGP SIGNATURE-----"
#> [52] "Version: GnuPG v1.0.6 (GNU/Linux)"
#> [53] "Comment: Exmh version 2.2_20000822 06/23/2000"
#> [54] ""
#> [55] "iD8DBQE9ZQJ/K9b4h5R0IUIRAiPuAJwL4mUus5whLNQZC8MsDlGpEdKNrACcDfZH"
#> [56] "PcGgN9frLIM+C5Z3vagi2wE="
#> [57] "=qJoJ"
#> [58] "-----END PGP SIGNATURE-----"
#> [59] ""
#> [60] "--==_Exmh_-1317289252P--"
#> [61] ""
#> [62] ""
#> [63] ""
#> [64] "_______________________________________________"
#> [65] "Exmh-workers mailing list"
#> [66] "Exmh-workers@redhat.com"
#> [67] "https://listman.redhat.com/mailman/listinfo/exmh-workers"
#> [68] ""
boundary = getBoundary(headerList[[15]])
body = sampleSplit[[15]]$body
bString = paste("--", boundary, sep = "")
bStringLocs = which(bString == body)
bStringLocs
#> [1] 2 35
eString = paste("--", boundary, "--", sep = "")
eStringLoc = which(eString == body)
eStringLoc
#> [1] 77
msg = body[(bStringLocs[1] + 1):(bStringLocs[2] - 1)]
tail(msg)
#> [1] ">" ">Yuck" "> " ">" "" ""
msg = c(msg, body[(eStringLoc + 1):length(body)])
tail(msg)
#> [1] "> " ">" "" "" "" ""
dropAttach = function(body, boundary) {
bString = paste("--", boundary, sep = "")
bStringLocs = which(bString == body)
if (length(bStringLocs) <= 1)
return(body)
eString = paste("--", boundary, "--", sep = "")
eStringLoc = which(eString == body)
if (length(eStringLoc) == 0)
return(body[(bStringLocs[1] + 1):(bStringLocs[2] - 1)])
n = length(body)
if (eStringLoc < n)
return(body[c((bStringLocs[1] + 1):(bStringLocs[2] - 1), ((eStringLoc +
1):n))])
return(body[(bStringLocs[1] + 1):(bStringLocs[2] - 1)])
}
head(sampleSplit[[1]]$body)
#> [1] " Date: Wed, 21 Aug 2002 10:54:46 -0500"
#> [2] " From: Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>"
#> [3] " Message-ID: <1029945287.4797.TMDA@deepeddy.vircio.com>"
#> [4] ""
#> [5] ""
#> [6] " | I can't reproduce this error."
msg = sampleSplit[[3]]$body
head(msg)
#> [1] "Man Threatens Explosion In Moscow "
#> [2] ""
#> [3] "Thursday August 22, 2002 1:40 PM"
#> [4] "MOSCOW (AP) - Security officers on Thursday seized an unidentified man who"
#> [5] "said he was armed with explosives and threatened to blow up his truck in"
#> [6] "front of Russia's Federal Security Services headquarters in Moscow, NTV"
#> [1] "Man Threatens Explosion In Moscow "
#> [2] "Thursday August 22, 2002 1:40 PM"
#> [3] "4 DVDs Free +s&p Join Now"
#> [4] "http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM"
cleanMsg = tolower(gsub("[[:punct:]0-9[:blank:]]+", " ", msg))
cleanMsg[c(1, 3, 26, 27)]
#> [1] "man threatens explosion in moscow "
#> [2] "thursday august pm"
#> [3] " dvds free s p join now"
#> [4] "http us click yahoo com pt ybb nxieaa mg haa gsolb tm"
library(tm)
stopWords = stopwords()
cleanSW = tolower(gsub("[[:punct:]0-9[:blank:]]+", " ", stopWords))
SWords = unlist(strsplit(cleanSW, "[[:blank:]]+"))
SWords = SWords[nchar(SWords) > 1]
stopWords = unique(SWords)
words = unlist(strsplit(cleanMsg, "[[:blank:]]+"))
words = words[nchar(words) > 1]
words = words[!(words %in% stopWords)]
head(words)
#> [1] "man" "threatens" "explosion" "moscow" "thursday" "august"
cleanText = function(msg) {
tolower(gsub("[[:punct:]0-9[:space:][:blank:]]+", " ", msg))
}
findMsgWords = function(msg, stopWords) {
if (is.null(msg))
return(character())
words = unique(unlist(strsplit(cleanText(msg), "[[:blank:]\t]+")))
# drop empty and 1 letter words
words = words[nchar(words) > 1]
words = words[!(words %in% stopWords)]
invisible(words)
}
processAllWords = function(dirName, stopWords) {
# read all files in the directory
fileNames = list.files(dirName, full.names = TRUE)
# drop files that are not email, i.e., cmds
notEmail = grep("cmds$", fileNames)
if (length(notEmail) > 0)
fileNames = fileNames[-notEmail]
messages = lapply(fileNames, readLines, encoding = "utf-8")
# split header and body
emailSplit = lapply(messages, splitMessage)
# put body and header in own lists
bodyList = lapply(emailSplit, function(msg) msg$body)
headerList = lapply(emailSplit, function(msg) msg$header)
rm(emailSplit)
# determine which messages have attachments
hasAttach = sapply(headerList, function(header) {
CTloc = grep("Content-Type", header)
if (length(CTloc) == 0)
return(0)
multi = grep("multi", tolower(header[CTloc]))
if (length(multi) == 0)
return(0)
multi
})
hasAttach = which(hasAttach > 0)
# find boundary strings for messages with attachments
boundaries = sapply(headerList[hasAttach], getBoundary)
# drop attachments from message body
bodyList[hasAttach] = mapply(dropAttach, bodyList[hasAttach], boundaries,
SIMPLIFY = FALSE)
# extract words from body
msgWordsList = lapply(bodyList, findMsgWords, stopWords)
invisible(msgWordsList)
}
msgWordsList = lapply(fullDirNames, processAllWords, stopWords = stopWords)
numMsgs = sapply(msgWordsList, length)
numMsgs
#> [1] 5051 1400 500 1000 1397
isSpam = rep(c(FALSE, FALSE, FALSE, TRUE, TRUE), numMsgs)
msgWordsList = unlist(msgWordsList, recursive = FALSE)
numEmail = length(isSpam)
numSpam = sum(isSpam)
numHam = numEmail - numSpam
set.seed(418910)
testSpamIdx = sample(numSpam, size = floor(numSpam/3))
testHamIdx = sample(numHam, size = floor(numHam/3))
testMsgWords = c((msgWordsList[isSpam])[testSpamIdx], (msgWordsList[!isSpam])[testHamIdx])
trainMsgWords = c((msgWordsList[isSpam])[-testSpamIdx], (msgWordsList[!isSpam])[-testHamIdx])
testIsSpam = rep(c(TRUE, FALSE), c(length(testSpamIdx), length(testHamIdx)))
trainIsSpam = rep(c(TRUE, FALSE), c(numSpam - length(testSpamIdx), numHam -
length(testHamIdx)))
bow = unique(unlist(trainMsgWords))
length(bow)
#> [1] 80059
spamWordCounts = rep(0, length(bow))
names(spamWordCounts) = bow
tmp = lapply(trainMsgWords[trainIsSpam], unique)
tt = table(unlist(tmp))
spamWordCounts[names(tt)] = tt
computeFreqs = function(wordsList, spam, bow = unique(unlist(wordsList))) {
# create a matrix for spam, ham, and log odds
wordTable = matrix(0.5, nrow = 4, ncol = length(bow), dimnames = list(c("spam",
"ham", "presentLogOdds", "absentLogOdds"), bow))
# For each spam message, add 1 to counts for words in message
counts.spam = table(unlist(lapply(wordsList[spam], unique)))
wordTable["spam", names(counts.spam)] = counts.spam + 0.5
# Similarly for ham messages
counts.ham = table(unlist(lapply(wordsList[!spam], unique)))
wordTable["ham", names(counts.ham)] = counts.ham + 0.5
# Find the total number of spam and ham
numSpam = sum(spam)
numHam = length(spam) - numSpam
# Prob(word|spam) and Prob(word | ham)
wordTable["spam", ] = wordTable["spam", ]/(numSpam + 0.5)
wordTable["ham", ] = wordTable["ham", ]/(numHam + 0.5)
# log odds
wordTable["presentLogOdds", ] = log(wordTable["spam", ]) - log(wordTable["ham",
])
wordTable["absentLogOdds", ] = log((1 - wordTable["spam", ])) - log((1 -
wordTable["ham", ]))
invisible(wordTable)
}
trainTable = computeFreqs(trainMsgWords, trainIsSpam)
newMsg = testMsgWords[[1]]
newMsg = newMsg[!is.na(match(newMsg, colnames(trainTable)))]
present = colnames(trainTable) %in% newMsg
sum(trainTable["presentLogOdds", present]) + sum(trainTable["absentLogOdds",
!present])
#> [1] 255
newMsg = testMsgWords[[which(!testIsSpam)[1]]]
newMsg = newMsg[!is.na(match(newMsg, colnames(trainTable)))]
present = (colnames(trainTable) %in% newMsg)
sum(trainTable["presentLogOdds", present]) + sum(trainTable["absentLogOdds",
!present])
#> [1] -124
computeMsgLLR = function(words, freqTable) {
# Discards words not in training data.
words = words[!is.na(match(words, colnames(freqTable)))]
# Find which words are present
present = colnames(freqTable) %in% words
sum(freqTable["presentLogOdds", present]) + sum(freqTable["absentLogOdds",
!present])
}
testLLR = sapply(testMsgWords, computeMsgLLR, trainTable)
tapply(testLLR, testIsSpam, summary)
#> $`FALSE`
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> -1362 -127 -101 -116 -81 700
#>
#> $`TRUE`
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> -61 6 50 138 132 23518
#pdf("SP_Boxplot.pdf", width = 6, height = 6)
spamLab = c("ham", "spam")[1 + testIsSpam]
boxplot(testLLR ~ spamLab, ylab = "Log Likelihood Ratio",
# main = "Log Likelihood Ratio for Randomly Chosen Test Messages",
ylim=c(-500, 500))

typeIErrorRate = function(tau, llrVals, spam) {
classify = llrVals > tau
sum(classify & !spam)/sum(!spam)
}
typeIErrorRate(0, testLLR, testIsSpam)
#> [1] 0.0035
typeIErrorRate(-20, testLLR, testIsSpam)
#> [1] 0.0056
typeIErrorRates = function(llrVals, isSpam) {
o = order(llrVals)
llrVals = llrVals[o]
isSpam = isSpam[o]
idx = which(!isSpam)
N = length(idx)
list(error = (N:1)/N, values = llrVals[idx])
}
typeIIErrorRates = function(llrVals, isSpam) {
o = order(llrVals)
llrVals = llrVals[o]
isSpam = isSpam[o]
idx = which(isSpam)
N = length(idx)
list(error = (1:(N))/N, values = llrVals[idx])
}
xI = typeIErrorRates(testLLR, testIsSpam)
xII = typeIIErrorRates(testLLR, testIsSpam)
tau01 = round(min(xI$values[xI$error <= 0.01]))
t2 = max(xII$error[xII$values < tau01])
# pdf('LinePlotTypeI+IIErrors.pdf', width = 8, height = 6)
library(RColorBrewer)
cols = brewer.pal(9, "Set1")[c(3, 4, 5)]
plot(xII$error ~ xII$values, type = "l", col = cols[1], lwd = 3, xlim = c(-300,
250), ylim = c(0, 1), xlab = "Log Likelihood Ratio Values", ylab = "Error Rate")
points(xI$error ~ xI$values, type = "l", col = cols[2], lwd = 3)
legend(x = 50, y = 0.4, fill = c(cols[2], cols[1]), legend = c("Classify Ham as Spam",
"Classify Spam as Ham"), cex = 0.8, bty = "n")
abline(h = 0.01, col = "grey", lwd = 3, lty = 2)
text(-250, 0.05, pos = 4, "Type I Error = 0.01", col = cols[2])
mtext(tau01, side = 1, line = 0.5, at = tau01, col = cols[3])
segments(x0 = tau01, y0 = -0.5, x1 = tau01, y1 = t2, lwd = 2, col = "grey")
text(tau01 + 20, 0.05, pos = 4, paste("Type II Error = ", round(t2, digits = 2)),
col = cols[1])

k = 5
numTrain = length(trainMsgWords)
partK = sample(numTrain)
tot = k * floor(numTrain/k)
partK = matrix(partK[1:tot], ncol = k)
testFoldOdds = NULL
for (i in 1:k) {
foldIdx = partK[, i]
trainTabFold = computeFreqs(trainMsgWords[-foldIdx], trainIsSpam[-foldIdx])
testFoldOdds = c(testFoldOdds, sapply(trainMsgWords[foldIdx], computeMsgLLR,
trainTabFold))
}
testFoldSpam = NULL
for (i in 1:k) {
foldIdx = partK[, i]
testFoldSpam = c(testFoldSpam, trainIsSpam[foldIdx])
}
xFoldI = typeIErrorRates(testFoldOdds, testFoldSpam)
xFoldII = typeIIErrorRates(testFoldOdds, testFoldSpam)
tauFoldI = round(min(xFoldI$values[xFoldI$error <= 0.01]))
tFold2 = xFoldII$error[xFoldII$values < tauFoldI]
smallNums = rep((1/2)^40, 2e+06)
largeNum = 10000
print(sum(smallNums), digits = 20)
#> [1] 1.8189894035458565e-06
print(largeNum + sum(smallNums), digits = 20)
#> [1] 10000.000001818989
for (i in 1:length(smallNums)) {
largeNum = largeNum + smallNums[i]
}
print(largeNum, digits = 20)
#> [1] 10000
sampleSplit = lapply(sampleEmail, splitMessage)
header = sampleSplit[[1]]$header
header[1:12]
#> [1] "From exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002"
#> [2] "Return-Path: <exmh-workers-admin@spamassassin.taint.org>"
#> [3] "Delivered-To: zzzz@localhost.netnoteinc.com"
#> [4] "Received: from localhost (localhost [127.0.0.1])"
#> [5] "\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36"
#> [6] "\tfor <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)"
#> [7] "Received: from phobos [127.0.0.1]"
#> [8] "\tby localhost with IMAP (fetchmail-5.9.0)"
#> [9] "\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)"
#> [10] "Received: from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by"
#> [11] " dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MBYrZ04811 for"
#> [12] " <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002 12:34:53 +0100"
header[1] = sub("^From", "Top-From:", header[1])
header[1]
#> [1] "Top-From: exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002"
headerPieces = read.dcf(textConnection(header), all = TRUE)
headerPieces[, "Delivered-To"]
#> [[1]]
#> [1] "zzzz@localhost.netnoteinc.com"
#> [2] "exmh-workers@listman.spamassassin.taint.org"
headerVec = unlist(headerPieces)
dupKeys = sapply(headerPieces, function(x) length(unlist(x)))
names(headerVec) = rep(colnames(headerPieces), dupKeys)
headerVec[which(names(headerVec) == "Delivered-To")]
#> Delivered-To
#> "zzzz@localhost.netnoteinc.com"
#> Delivered-To
#> "exmh-workers@listman.spamassassin.taint.org"
#> [1] 36
length(unique(names(headerVec)))
#> [1] 26
processHeader = function(header) {
# modify the first line to create a key:value pair
header[1] = sub("^From", "Top-From:", header[1])
headerMat = read.dcf(textConnection(header), all = TRUE)
headerVec = unlist(headerMat)
dupKeys = sapply(headerMat, function(x) length(unlist(x)))
names(headerVec) = rep(colnames(headerMat), dupKeys)
return(headerVec)
}
headerList = lapply(sampleSplit, function(msg) {
processHeader(msg$header)
})
contentTypes = sapply(headerList, function(header) header["Content-Type"])
names(contentTypes) = NULL
contentTypes
#> [1] "text/plain; charset=us-ascii"
#> [2] "text/plain; charset=US-ASCII"
#> [3] "text/plain; charset=US-ASCII"
#> [4] "text/plain; charset=\"us-ascii\""
#> [5] "text/plain; charset=US-ASCII"
#> [6] "multipart/signed;\nboundary=\"==_Exmh_-1317289252P\";\nmicalg=pgp-sha1;\nprotocol=\"application/pgp-signature\""
#> [7] NA
#> [8] "multipart/alternative;\nboundary=\"----=_NextPart_000_00C1_01C25017.F2F04E20\""
#> [9] "multipart/alternative; boundary=Apple-Mail-2-874629474"
#> [10] "multipart/signed;\nboundary=\"==_Exmh_-518574644P\";\nmicalg=pgp-sha1;\nprotocol=\"application/pgp-signature\""
#> [11] "multipart/related;\nboundary=\"------------090602010909000705010009\""
#> [12] "multipart/signed;\nboundary=\"==_Exmh_-451422450P\";\nmicalg=pgp-sha1;\nprotocol=\"application/pgp-signature\""
#> [13] "multipart/signed;\nboundary=\"==_Exmh_267413022P\";\nmicalg=pgp-sha1;\nprotocol=\"application/pgp-signature\""
#> [14] "multipart/mixed;\nboundary=\"----=_NextPart_000_0005_01C26412.7545C1D0\""
#> [15] "multipart/alternative;\nboundary=\"------------080209060700030309080805\""
hasAttach = grep("^ *multi", tolower(contentTypes))
hasAttach
#> [1] 6 8 9 10 11 12 13 14 15
boundaries = getBoundary(contentTypes[hasAttach])
boundaries
#> [1] "==_Exmh_-1317289252P"
#> [2] "----=_NextPart_000_00C1_01C25017.F2F04E20"
#> [3] "Apple-Mail-2-874629474"
#> [4] "==_Exmh_-518574644P"
#> [5] "------------090602010909000705010009"
#> [6] "==_Exmh_-451422450P"
#> [7] "==_Exmh_267413022P"
#> [8] "----=_NextPart_000_0005_01C26412.7545C1D0"
#> [9] "------------080209060700030309080805"
boundary = boundaries[9]
body = sampleSplit[[15]]$body
bString = paste("--", boundary, sep = "")
bStringLocs = which(bString == body)
bStringLocs
#> [1] 2 35
eString = paste("--", boundary, "--", sep = "")
eStringLoc = which(eString == body)
eStringLoc
#> [1] 77
diff(c(bStringLocs[-1], eStringLoc))
#> [1] 42
### This code has mistakes in it - and we fix them later!
processAttach = function(body, contentType) {
boundary = getBoundary(contentType)
bString = paste("--", boundary, "$", sep = "")
bStringLocs = grep(bString, body)
eString = paste("--", boundary, "--$", sep = "")
eStringLoc = grep(eString, body)
n = length(body)
if (length(eStringLoc) == 0)
eStringLoc = n + 1
if (length(bStringLocs) == 1)
attachLocs = NULL else attachLocs = c(bStringLocs[-1], eStringLoc)
msg = body[(bStringLocs[1] + 1):min(n, (bStringLocs[2] - 1), na.rm = TRUE)]
if (eStringLoc < n)
msg = c(msg, body[(eStringLoc + 1):n])
if (!is.null(attachLocs)) {
attachLens = diff(attachLocs, lag = 1)
attachTypes = mapply(function(begL, endL) {
contentTypeLoc = grep("[Cc]ontent-[Tt]ype", body[(begL + 1):(endL -
1)])
contentType = body[begL + contentTypeLoc]
contentType = gsub("\"", "", contentType)
MIMEType = sub(" *Content-Type: *([^;]*);?.*", "\\1", contentType)
return(MIMEType)
}, attachLocs[-length(attachLocs)], attachLocs[-1])
}
if (is.null(attachLocs))
return(list(body = msg, attachInfo = NULL)) else return(list(body = msg, attachDF = data.frame(aLen = attachLens, aType = attachTypes,
stringsAsFactors = FALSE)))
}
bodyList = lapply(sampleSplit, function(msg) msg$body)
attList = mapply(processAttach, bodyList[hasAttach], contentTypes[hasAttach],
SIMPLIFY = FALSE)
lens = sapply(attList, function(processedA) processedA$attachDF$aLen)
head(lens) # will cause an error with the first message. its ok.
#> $`../Data/spam//messages/easy_ham/00014.cb20e10b2bfcb8210a1c310798532a57`
#> [1] 12
#>
#> $`../Data/spam//messages/easy_ham/00062.009f5a1a8fa88f0b38299ad01562bb37`
#> [1] 44 44
#>
#> $`../Data/spam//messages/easy_ham/00063.0acbc484a73f0e0b727e06c100d8df7b`
#> [1] 83
#>
#> $`../Data/spam//messages/easy_ham/0030.77828e31de08ebb58b583688b87524cc`
#> [1] 12
#>
#> $`../Data/spam//messages/easy_ham/00368.f86324a03e7ae7070cc40f302385f5d3`
#> NULL
#>
#> $`../Data/spam//messages/easy_ham/00389.8606961eaeef7b921ce1c53773248d69`
#> [1] 12
body = bodyList[hasAttach][[2]]
length(body)
#> [1] 86
#> [1] ""
#> [2] "------=_NextPart_000_00C1_01C25017.F2F04E20"
#> [3] "Content-Type: text/html;"
#> [4] "\tcharset=\"Windows-1252\""
#> [5] "Content-Transfer-Encoding: quoted-printable"
#> [6] ""
#> [7] "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">"
#> [8] "<HTML><HEAD>"
#> [9] "<META http-equiv=3DContent-Type content=3D\"text/html; ="
#> [10] "charset=3Dwindows-1252\">"
#> [11] "<META content=3D\"MSHTML 6.00.2716.2200\" name=3DGENERATOR>"
processAttach = function(body, contentType) {
n = length(body)
boundary = getBoundary(contentType)
bString = paste("--", boundary, sep = "")
bStringLocs = which(bString == body)
eString = paste("--", boundary, "--", sep = "")
eStringLoc = which(eString == body)
if (length(eStringLoc) == 0)
eStringLoc = n
if (length(bStringLocs) <= 1) {
attachLocs = NULL
msgLastLine = n
if (length(bStringLocs) == 0)
bStringLocs = 0
} else {
attachLocs = c(bStringLocs[-1], eStringLoc)
msgLastLine = bStringLocs[2] - 1
}
msg = body[(bStringLocs[1] + 1):msgLastLine]
if (eStringLoc < n)
msg = c(msg, body[(eStringLoc + 1):n])
if (!is.null(attachLocs)) {
attachLens = diff(attachLocs, lag = 1)
attachTypes = mapply(function(begL, endL) {
CTloc = grep("^[Cc]ontent-[Tt]ype", body[(begL + 1):(endL - 1)])
if (length(CTloc) == 0) {
MIMEType = NA
} else {
CTval = body[begL + CTloc[1]]
CTval = gsub("\"", "", CTval)
MIMEType = sub(" *[Cc]ontent-[Tt]ype: *([^;]*);?.*", "\\1",
CTval)
}
return(MIMEType)
}, attachLocs[-length(attachLocs)], attachLocs[-1])
}
if (is.null(attachLocs))
return(list(body = msg, attachDF = NULL))
return(list(body = msg, attachDF = data.frame(aLen = attachLens, aType = unlist(attachTypes),
stringsAsFactors = FALSE)))
}
readEmail = function(dirName) {
# retrieve the names of files in directory
fileNames = list.files(dirName, full.names = TRUE)
# drop files that are not email
notEmail = grep("cmds$", fileNames)
if (length(notEmail) > 0)
fileNames = fileNames[-notEmail]
# read all files in the directory
lapply(fileNames, readLines, encoding = "latin1")
}
processAllEmail = function(dirName, isSpam = FALSE) {
# read all files in the directory
messages = readEmail(dirName)
fileNames = names(messages)
n = length(messages)
# split header from body
eSplit = lapply(messages, splitMessage)
rm(messages)
# process header as named character vector
headerList = lapply(eSplit, function(msg) processHeader(msg$header))
# extract content-type key
contentTypes = sapply(headerList, function(header) header["Content-Type"])
# extract the body
bodyList = lapply(eSplit, function(msg) msg$body)
rm(eSplit)
# which email have attachments
hasAttach = grep("^ *multi", tolower(contentTypes))
# get summary stats for attachments and the shorter body
attList = mapply(processAttach, bodyList[hasAttach], contentTypes[hasAttach],
SIMPLIFY = FALSE)
bodyList[hasAttach] = lapply(attList, function(attEl) attEl$body)
attachInfo = vector("list", length = n)
attachInfo[hasAttach] = lapply(attList, function(attEl) attEl$attachDF)
# prepare return structure
emailList = mapply(function(header, body, attach, isSpam) {
list(isSpam = isSpam, header = header, body = body, attach = attach)
}, headerList, bodyList, attachInfo, rep(isSpam, n), SIMPLIFY = FALSE)
names(emailList) = fileNames
invisible(emailList)
}
emailStruct = mapply(processAllEmail, fullDirNames, isSpam = rep(c(FALSE, TRUE),
3:2))
emailStruct = unlist(emailStruct, recursive = FALSE)
sampleStruct = emailStruct[indx]
save(emailStruct, file = "emailXX.rda")
header = sampleStruct[[1]]$header
subject = header["Subject"]
els = strsplit(subject, "")
all(els %in% LETTERS)
#> [1] FALSE
testSubject = c("DEAR MADAME", "WINNER!", "")
els = strsplit(testSubject, "")
sapply(els, function(subject) all(subject %in% LETTERS))
#> [1] FALSE FALSE TRUE
gsub("[[:punct:] ]", "", testSubject)
#> [1] "DEARMADAME" "WINNER" ""
gsub("[^[:alpha:]]", "", testSubject)
#> [1] "DEARMADAME" "WINNER" ""
isYelling = function(msg) {
if ("Subject" %in% names(msg$header)) {
el = gsub("[^[:alpha:]]", "", msg$header["Subject"])
if (nchar(el) > 0)
nchar(gsub("[A-Z]", "", el)) < 1 else FALSE
} else NA
}
perCaps = function(msg) {
body = paste(msg$body, collapse = "")
# Return NA if the body of the message is 'empty'
if (length(body) == 0 || nchar(body) == 0)
return(NA)
# Eliminate non-alpha characters
body = gsub("[^[:alpha:]]", "", body)
capText = gsub("[^A-Z]", "", body)
100 * nchar(capText)/nchar(body)
}
sapply(sampleStruct, perCaps)
#> ../Data/spam//messages/easy_ham1 ../Data/spam//messages/easy_ham2
#> 4.5 7.5
#> ../Data/spam//messages/easy_ham3 ../Data/spam//messages/easy_ham4
#> 7.4 5.1
#> ../Data/spam//messages/easy_ham5 ../Data/spam//messages/easy_ham15
#> 6.1 7.7
#> ../Data/spam//messages/easy_ham27 ../Data/spam//messages/easy_ham68
#> 5.5 10.1
#> ../Data/spam//messages/easy_ham69 ../Data/spam//messages/easy_ham329
#> 10.9 6.5
#> ../Data/spam//messages/easy_ham404 ../Data/spam//messages/easy_ham427
#> 9.6 12.0
#> ../Data/spam//messages/easy_ham516 ../Data/spam//messages/easy_ham852
#> 9.2 1.7
#> ../Data/spam//messages/easy_ham971
#> 6.4
funcList = list(isRe = function(msg) {
"Subject" %in% names(msg$header) && length(grep("^[ \t]*Re:", msg$header[["Subject"]])) >
0
}, numLines = function(msg) length(msg$body), isYelling = function(msg) {
if ("Subject" %in% names(msg$header)) {
el = gsub("[^[:alpha:]]", "", msg$header["Subject"])
if (nchar(el) > 0) nchar(gsub("[A-Z]", "", el)) < 1 else FALSE
} else NA
}, perCaps = function(msg) {
body = paste(msg$body, collapse = "")
# Return NA if the body of the message is 'empty'
if (length(body) == 0 || nchar(body) == 0) return(NA)
# Eliminate non-alpha characters
body = gsub("[^[:alpha:]]", "", body)
capText = gsub("[^A-Z]", "", body)
100 * nchar(capText)/nchar(body)
})
lapply(funcList, function(func) sapply(sampleStruct, function(msg) func(msg)))
#> $isRe
#> ../Data/spam//messages/easy_ham1 ../Data/spam//messages/easy_ham2
#> TRUE FALSE
#> ../Data/spam//messages/easy_ham3 ../Data/spam//messages/easy_ham4
#> FALSE FALSE
#> ../Data/spam//messages/easy_ham5 ../Data/spam//messages/easy_ham15
#> TRUE TRUE
#> ../Data/spam//messages/easy_ham27 ../Data/spam//messages/easy_ham68
#> TRUE FALSE
#> ../Data/spam//messages/easy_ham69 ../Data/spam//messages/easy_ham329
#> TRUE TRUE
#> ../Data/spam//messages/easy_ham404 ../Data/spam//messages/easy_ham427
#> TRUE TRUE
#> ../Data/spam//messages/easy_ham516 ../Data/spam//messages/easy_ham852
#> TRUE FALSE
#> ../Data/spam//messages/easy_ham971
#> TRUE
#>
#> $numLines
#> ../Data/spam//messages/easy_ham1 ../Data/spam//messages/easy_ham2
#> 50 26
#> ../Data/spam//messages/easy_ham3 ../Data/spam//messages/easy_ham4
#> 38 32
#> ../Data/spam//messages/easy_ham5 ../Data/spam//messages/easy_ham15
#> 31 54
#> ../Data/spam//messages/easy_ham27 ../Data/spam//messages/easy_ham68
#> 35 36
#> ../Data/spam//messages/easy_ham69 ../Data/spam//messages/easy_ham329
#> 65 58
#> ../Data/spam//messages/easy_ham404 ../Data/spam//messages/easy_ham427
#> 70 31
#> ../Data/spam//messages/easy_ham516 ../Data/spam//messages/easy_ham852
#> 38 28
#> ../Data/spam//messages/easy_ham971
#> 34
#>
#> $isYelling
#> ../Data/spam//messages/easy_ham1.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham2.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham3.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham4.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham5.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham15.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham27.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham68.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham69.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham329.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham404.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham427.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham516.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham852.Subject
#> FALSE
#> ../Data/spam//messages/easy_ham971.Subject
#> FALSE
#>
#> $perCaps
#> ../Data/spam//messages/easy_ham1 ../Data/spam//messages/easy_ham2
#> 4.5 7.5
#> ../Data/spam//messages/easy_ham3 ../Data/spam//messages/easy_ham4
#> 7.4 5.1
#> ../Data/spam//messages/easy_ham5 ../Data/spam//messages/easy_ham15
#> 6.1 7.7
#> ../Data/spam//messages/easy_ham27 ../Data/spam//messages/easy_ham68
#> 5.5 10.1
#> ../Data/spam//messages/easy_ham69 ../Data/spam//messages/easy_ham329
#> 10.9 6.5
#> ../Data/spam//messages/easy_ham404 ../Data/spam//messages/easy_ham427
#> 9.6 12.0
#> ../Data/spam//messages/easy_ham516 ../Data/spam//messages/easy_ham852
#> 9.2 1.7
#> ../Data/spam//messages/easy_ham971
#> 6.4
createDerivedDF = function(email = emailStruct, operations = funcList, verbose = FALSE) {
els = lapply(names(operations), function(id) {
if (verbose)
print(id)
e = operations[[id]]
v = if (is.function(e))
sapply(email, e) else sapply(email, function(msg) eval(e))
v
})
df = as.data.frame(els)
names(df) = names(operations)
invisible(df)
}
sampleDF = createDerivedDF(sampleStruct)
head(sampleDF)
funcList = list(
isSpam =
expression(msg$isSpam)
,
isRe =
function(msg) {
# Can have a Fwd: Re: ... but we are not looking for this here.
# We may want to look at In-Reply-To field.
"Subject" %in% names(msg$header) &&
length(grep("^[ \t]*Re:", msg$header[["Subject"]])) > 0
}
,
numLines =
function(msg) length(msg$body)
,
bodyCharCt =
function(msg)
sum(nchar(msg$body))
,
underscore =
function(msg) {
if(!"Reply-To" %in% names(msg$header))
return(FALSE)
txt <- msg$header[["Reply-To"]]
length(grep("_", txt)) > 0 &&
length(grep("[0-9A-Za-z]+", txt)) > 0
}
,
subExcCt =
function(msg) {
x = msg$header["Subject"]
if(length(x) == 0 || sum(nchar(x)) == 0 || is.na(x))
return(NA)
sum(nchar(gsub("[^!]","", x)))
}
,
subQuesCt =
function(msg) {
x = msg$header["Subject"]
if(length(x) == 0 || sum(nchar(x)) == 0 || is.na(x))
return(NA)
sum(nchar(gsub("[^?]","", x)))
}
,
numAtt =
function(msg) {
if (is.null(msg$attach)) return(0)
else nrow(msg$attach)
}
,
priority =
function(msg) {
ans <- FALSE
# Look for names X-Priority, Priority, X-Msmail-Priority
# Look for high any where in the value
ind = grep("priority", tolower(names(msg$header)))
if (length(ind) > 0) {
ans <- length(grep("high", tolower(msg$header[ind]))) >0
}
ans
}
,
numRec =
function(msg) {
# unique or not.
els = getMessageRecipients(msg$header)
if(length(els) == 0)
return(NA)
# Split each line by "," and in each of these elements, look for
# the @ sign. This handles
tmp = sapply(strsplit(els, ","), function(x) grep("@", x))
sum(sapply(tmp, length))
}
,
perCaps =
function(msg)
{
body = paste(msg$body, collapse = "")
# Return NA if the body of the message is "empty"
if(length(body) == 0 || nchar(body) == 0) return(NA)
# Eliminate non-alpha characters and empty lines
body = gsub("[^[:alpha:]]", "", body)
els = unlist(strsplit(body, ""))
ctCap = sum(els %in% LETTERS)
100 * ctCap / length(els)
}
,
isInReplyTo =
function(msg)
{
"In-Reply-To" %in% names(msg$header)
}
,
sortedRec =
function(msg)
{
ids = getMessageRecipients(msg$header)
all(sort(ids) == ids)
}
,
subPunc =
function(msg)
{
if("Subject" %in% names(msg$header)) {
el = gsub("['/.:@-]", "", msg$header["Subject"])
length(grep("[A-Za-z][[:punct:]]+[A-Za-z]", el)) > 0
}
else
FALSE
},
hour =
function(msg)
{
date = msg$header["Date"]
if ( is.null(date) ) return(NA)
# Need to handle that there may be only one digit in the hour
locate = regexpr("[0-2]?[0-9]:[0-5][0-9]:[0-5][0-9]", date)
if (locate < 0)
locate = regexpr("[0-2]?[0-9]:[0-5][0-9]", date)
if (locate < 0) return(NA)
hour = substring(date, locate, locate+1)
hour = as.numeric(gsub(":", "", hour))
locate = regexpr("PM", date)
if (locate > 0) hour = hour + 12
locate = regexpr("[+-][0-2][0-9]00", date)
if (locate < 0) offset = 0
else offset = as.numeric(substring(date, locate, locate + 2))
(hour - offset) %% 24
}
,
multipartText =
function(msg)
{
if (is.null(msg$attach)) return(FALSE)
numAtt = nrow(msg$attach)
types =
length(grep("(html|plain|text)", msg$attach$aType)) > (numAtt/2)
}
,
hasImages =
function(msg)
{
if (is.null(msg$attach)) return(FALSE)
length(grep("^ *image", tolower(msg$attach$aType))) > 0
}
,
isPGPsigned =
function(msg)
{
if (is.null(msg$attach)) return(FALSE)
length(grep("pgp", tolower(msg$attach$aType))) > 0
},
perHTML =
function(msg)
{
if(! ("Content-Type" %in% names(msg$header))) return(0)
el = tolower(msg$header["Content-Type"])
if (length(grep("html", el)) == 0) return(0)
els = gsub("[[:space:]]", "", msg$body)
totchar = sum(nchar(els))
totplain = sum(nchar(gsub("<[^<]+>", "", els )))
100 * (totchar - totplain)/totchar
},
subSpamWords =
function(msg)
{
if("Subject" %in% names(msg$header))
length(grep(paste(SpamCheckWords, collapse = "|"),
tolower(msg$header["Subject"]))) > 0
else
NA
}
,
subBlanks =
function(msg)
{
if("Subject" %in% names(msg$header)) {
x = msg$header["Subject"]
# should we count blank subject line as 0 or 1 or NA?
if (nchar(x) == 1) return(0)
else 100 *(1 - (nchar(gsub("[[:blank:]]", "", x))/nchar(x)))
} else NA
}
,
noHost =
function(msg)
{
# Or use partial matching.
idx = pmatch("Message-", names(msg$header))
if(is.na(idx)) return(NA)
tmp = msg$header[idx]
return(length(grep(".*@[^[:space:]]+", tmp)) == 0)
}
,
numEnd =
function(msg)
{
# If we just do a grep("[0-9]@", )
# we get matches on messages that have a From something like
# " \"marty66@aol.com\" <synjan@ecis.com>"
# and the marty66 is the "user's name" not the login
# So we can be more precise if we want.
x = names(msg$header)
if ( !( "From" %in% x) ) return(NA)
login = gsub("^.*<", "", msg$header["From"])
if ( is.null(login) )
login = gsub("^.*<", "", msg$header["X-From"])
if ( is.null(login) ) return(NA)
login = strsplit(login, "@")[[1]][1]
length(grep("[0-9]+$", login)) > 0
},
isYelling =
function(msg)
{
if ( "Subject" %in% names(msg$header) ) {
el = gsub("[^[:alpha:]]", "", msg$header["Subject"])
if (nchar(el) > 0) nchar(gsub("[A-Z]", "", el)) < 1
else FALSE
}
else
NA
},
forwards =
function(msg)
{
x = msg$body
if(length(x) == 0 || sum(nchar(x)) == 0)
return(NA)
ans = length(grep("^[[:space:]]*>", x))
100 * ans / length(x)
},
isOrigMsg =
function(msg)
{
x = msg$body
if(length(x) == 0) return(NA)
length(grep("^[^[:alpha:]]*original[^[:alpha:]]+message[^[:alpha:]]*$",
tolower(x) ) ) > 0
},
isDear =
function(msg)
{
x = msg$body
if(length(x) == 0) return(NA)
length(grep("^[[:blank:]]*dear +(sir|madam)\\>",
tolower(x))) > 0
},
isWrote =
function(msg)
{
x = msg$body
if(length(x) == 0) return(NA)
length(grep("(wrote|schrieb|ecrit|escribe):", tolower(x) )) > 0
},
avgWordLen =
function(msg)
{
txt = paste(msg$body, collapse = " ")
if(length(txt) == 0 || sum(nchar(txt)) == 0) return(0)
txt = gsub("[^[:alpha:]]", " ", txt)
words = unlist(strsplit(txt, "[[:blank:]]+"))
wordLens = nchar(words)
mean(wordLens[ wordLens > 0 ])
}
,
numDlr =
function(msg)
{
x = paste(msg$body, collapse = "")
if(length(x) == 0 || sum(nchar(x)) == 0)
return(NA)
nchar(gsub("[^$]","", x))
}
)
SpamCheckWords = c("viagra", "pounds", "free", "weight", "guarantee", "million",
"dollars", "credit", "risk", "prescription", "generic", "drug", "financial",
"save", "dollar", "erotic", "million", "barrister", "beneficiary", "easy",
"money back", "money", "credit card")
getMessageRecipients = function(header) {
c(if ("To" %in% names(header)) header[["To"]] else character(0), if ("Cc" %in%
names(header)) header[["Cc"]] else character(0), if ("Bcc" %in% names(header)) header[["Bcc"]] else character(0))
}
emailDF = createDerivedDF(emailStruct)
dim(emailDF)
#> [1] 9348 30
save(emailDF, file = "emailDF.rda")
# load('Data/spamAssassinDerivedDF.rda')
dim(emailDF)
#> [1] 9348 30
perCaps2 = function(msg) {
body = paste(msg$body, collapse = "")
# Return NA if the body of the message is 'empty'
if (length(body) == 0 || nchar(body) == 0)
return(NA)
# Eliminate non-alpha characters and empty lines
body = gsub("[^[:alpha:]]", "", body)
els = unlist(strsplit(body, ""))
ctCap = sum(els %in% LETTERS)
100 * ctCap/length(els)
}
pC = sapply(emailStruct, perCaps)
pC2 = sapply(emailStruct, perCaps2)
identical(pC, pC2)
#> [1] TRUE
indNA = which(is.na(emailDF$subExcCt))
indNoSubject = which(sapply(emailStruct, function(msg) !("Subject" %in% names(msg$header))))
all(indNA == indNoSubject)
#> [1] FALSE
all(emailDF$bodyCharCt > emailDF$numLines)
#> [1] TRUE
x.at = c(1, 10, 100, 1000, 10000, 1e+05)
y.at = c(1, 5, 10, 50, 100, 500, 5000)
nL = 1 + emailDF$numLines
nC = 1 + emailDF$bodyCharCt
# pdf('ScatterPlotNumLinesNumChars.pdf', width = 6, height = 4.5)
plot(nL ~ nC, log = "xy", pch = ".", xlim = c(1, 1e+05), axes = FALSE, xlab = "Number of Characters",
ylab = "Number of Lines")
box()
axis(1, at = x.at, labels = formatC(x.at, digits = 0, format = "d"))
axis(2, at = y.at, labels = formatC(y.at, digits = 0, format = "d"))
abline(a = 0, b = 1, col = "red", lwd = 2)

#> null device
#> 1
# pdf('SPAM_boxplotsPercentCaps.pdf', width = 5, height = 5)
percent = emailDF$perCaps
isSpamLabs = factor(emailDF$isSpam, labels = c("ham", "spam"))
boxplot(log(1 + percent) ~ isSpamLabs, ylab = "Percent Capitals (log)")
# dev.off()
logPerCapsSpam = log(1 + emailDF$perCaps[emailDF$isSpam])
logPerCapsHam = log(1 + emailDF$perCaps[!emailDF$isSpam])
qqplot(logPerCapsSpam, logPerCapsHam, xlab = "Regular Email", ylab = "Spam Email",
main = "Percentage of Capital Letters (log scale)", pch = 19, cex = 0.3)

# pdf('SPAM_scatterplotPercentCapsTotChars.pdf', width = 8, height = 6)
colI = c("#4DAF4A80", "#984EA380")
logBodyCharCt = log(1 + emailDF$bodyCharCt)
logPerCaps = log(1 + emailDF$perCaps)
plot(logPerCaps ~ logBodyCharCt, xlab = "Total Characters (log)", ylab = "Percent Capitals (log)",
col = colI[1 + emailDF$isSpam], xlim = c(2, 12), pch = 19, cex = 0.5)

# dev.off()
table(emailDF$numAtt, isSpamLabs)
#> isSpamLabs
#> ham spam
#> 0 6624 2158
#> 1 314 230
#> 2 11 6
#> 4 0 1
#> 5 1 2
#> 18 1 0
# pdf('SPAM_mosaicPlots.pdf', width = 8, height = 4)
oldPar = par(mfrow = c(1, 2), mar = c(1, 1, 1, 1))
colM = c("#E41A1C80", "#377EB880")
isRe = factor(emailDF$isRe, labels = c("no Re:", "Re:"))
mosaicplot(table(isSpamLabs, isRe), main = "", xlab = "", ylab = "", color = colM)
fromNE = factor(emailDF$numEnd, labels = c("No #", "#"))
mosaicplot(table(isSpamLabs, fromNE), color = colM, main = "", xlab = "", ylab = "")

par(oldPar)
# dev.off()
library(rpart)
setupRpart = function(data) {
logicalVars = which(sapply(data, is.logical))
facVars = lapply(data[, logicalVars], function(x) {
x = as.factor(x)
levels(x) = c("F", "T")
x
})
cbind(facVars, data[, -logicalVars])
}
emailDFrp = setupRpart(emailDF)
set.seed(418910)
testSpamIdx = sample(numSpam, size = floor(numSpam/3))
testHamIdx = sample(numHam, size = floor(numHam/3))
testDF = rbind(emailDFrp[emailDFrp$isSpam == "T", ][testSpamIdx, ], emailDFrp[emailDFrp$isSpam ==
"F", ][testHamIdx, ])
trainDF = rbind(emailDFrp[emailDFrp$isSpam == "T", ][-testSpamIdx, ], emailDFrp[emailDFrp$isSpam ==
"F", ][-testHamIdx, ])
rpartFit = rpart(isSpam ~ ., data = trainDF, method = "class")
library(rpart.plot)
library(RColorBrewer)
# prp(rpartFit, extra = 1)
# pdf('SPAM_rpartTree.pdf', width = 7, height = 7)
rpart.plot(rpartFit, extra = 1)

# dev.off()
predictions = predict(rpartFit, newdata = testDF[, names(testDF) != "isSpam"],
type = "class")
predsForHam = predictions[testDF$isSpam == "F"]
summary(predsForHam)
#> F T
#> 2192 125
sum(predsForHam == "T")/length(predsForHam)
#> [1] 0.054
predsForSpam = predictions[testDF$isSpam == "T"]
sum(predsForSpam == "F")/length(predsForSpam)
#> [1] 0.16
complexityVals = c(seq(1e-05, 1e-04, length = 19), seq(1e-04, 0.001, length = 19),
seq(0.001, 0.005, length = 9), seq(0.005, 0.01, length = 9))
fits = lapply(complexityVals, function(x) {
rpartObj = rpart(isSpam ~ ., data = trainDF, method = "class", control = rpart.control(cp = x))
predict(rpartObj, newdata = testDF[, names(testDF) != "isSpam"], type = "class")
})
spam = testDF$isSpam == "T"
numSpam = sum(spam)
numHam = sum(!spam)
errs = sapply(fits, function(preds) {
typeI = sum(preds[!spam] == "T")/numHam
typeII = sum(preds[spam] == "F")/numSpam
c(typeI = typeI, typeII = typeII)
})
# pdf('SPAM_rpartTypeIandII.pdf', width = 8, height = 7)
library(RColorBrewer)
cols = brewer.pal(9, "Set1")[c(3, 4, 5)]
plot(errs[1, ] ~ complexityVals, type = "l", col = cols[2], lwd = 2, ylim = c(0,
0.2), xlim = c(0, 0.01), ylab = "Error", xlab = "complexity parameter values")
points(errs[2, ] ~ complexityVals, type = "l", col = cols[1], lwd = 2)
text(x = c(0.003, 0.0035), y = c(0.12, 0.05), labels = c("Type II Error", "Type I Error"))
minI = which(errs[1, ] == min(errs[1, ]))[1]
abline(v = complexityVals[minI], col = "grey", lty = 3, lwd = 2)
text(7e-04, errs[1, minI] + 0.01, formatC(errs[1, minI], digits = 2))
text(7e-04, errs[2, minI] + 0.01, formatC(errs[2, minI], digits = 3))

save(emailDFrp, file = "data.Rda")
library(caret)
setupRnum = function(data) {
logicalVars = which(sapply(data, is.logical))
facVars = lapply(data[, logicalVars], function(x) {
x = as.numeric(x)
})
cbind(facVars, data[, -logicalVars])
}
emailDFnum = setupRnum(emailDF)
emailDFnum[is.na(emailDFnum)] <- 0
cv_folds <- createFolds(emailDFnum$isSpam, k = 5, list = TRUE, returnTrain = TRUE)
lengths(cv_folds)
#> Fold1 Fold2 Fold3 Fold4 Fold5
#> 7478 7478 7479 7479 7478
library(MLmetrics)
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
p <- Precision(y_pred = data$pred, y_true = data$obs, positive = lev[1])
r <- Recall(y_pred = data$pred, y_true = data$obs, positive = lev[1])
fp <- sum(data$pred == 0 & data$obs == 1)/length(data$pred)
fn <- sum(data$pred == 1 & data$obs == 0)/length(data$pred)
c(F1 = f1_val, prec = p, rec = r, Type_I_err = fp, Type_II_err = fn)
}
library(naivebayes)
library(e1071)
nb_grid <- expand.grid(laplace = c(0, 0.1, 0.3, 0.5, 1), usekernel = c(T, F),
adjust = c(T, F))
train_control <- trainControl(method = "cv", number = 3, savePredictions = "final",
summaryFunction = f1)
model_nb <- caret::train(as.factor(isSpam) ~ ., data = emailDFnum, trControl = train_control,
method = "naive_bayes", tuneGrid = nb_grid)
model_nb
#> Naive Bayes
#>
#> 9348 samples
#> 29 predictor
#> 2 classes: '0', '1'
#>
#> No pre-processing
#> Resampling: Cross-Validated (3 fold)
#> Summary of sample sizes: 6232, 6232, 6232
#> Resampling results across tuning parameters:
#>
#> laplace usekernel adjust F1 prec rec Type_I_err Type_II_err
#> 0.0 FALSE FALSE 0.91 0.94 0.89 0.045 0.0817
#> 0.0 FALSE TRUE 0.91 0.94 0.89 0.045 0.0817
#> 0.0 TRUE FALSE NaN NaN NaN NaN NaN
#> 0.0 TRUE TRUE 0.89 0.81 1.00 0.178 0.0019
#> 0.1 FALSE FALSE 0.91 0.94 0.89 0.045 0.0817
#> 0.1 FALSE TRUE 0.91 0.94 0.89 0.045 0.0817
#> 0.1 TRUE FALSE NaN NaN NaN NaN NaN
#> 0.1 TRUE TRUE 0.89 0.81 1.00 0.178 0.0019
#> 0.3 FALSE FALSE 0.91 0.94 0.89 0.045 0.0817
#> 0.3 FALSE TRUE 0.91 0.94 0.89 0.045 0.0817
#> 0.3 TRUE FALSE NaN NaN NaN NaN NaN
#> 0.3 TRUE TRUE 0.89 0.81 1.00 0.178 0.0019
#> 0.5 FALSE FALSE 0.91 0.94 0.89 0.045 0.0817
#> 0.5 FALSE TRUE 0.91 0.94 0.89 0.045 0.0817
#> 0.5 TRUE FALSE NaN NaN NaN NaN NaN
#> 0.5 TRUE TRUE 0.89 0.81 1.00 0.178 0.0019
#> 1.0 FALSE FALSE 0.91 0.94 0.89 0.045 0.0817
#> 1.0 FALSE TRUE 0.91 0.94 0.89 0.045 0.0817
#> 1.0 TRUE FALSE NaN NaN NaN NaN NaN
#> 1.0 TRUE TRUE 0.89 0.81 1.00 0.178 0.0019
#>
#> F1 was used to select the optimal model using the largest value.
#> The final values used for the model were laplace = 0, usekernel =
#> FALSE and adjust = FALSE.